1 MetabolicExpressR: Metabolic subtyping of patient tumors from gene expression data

This package aims to perform metabolic pathway-based subytping of patient tumor samples from gene expression data.

The main steps are:
1. Perform GSVA on cancer patient gene expression data using KEGG metabolic pathways.
2. Perform k-means clustering on the GSVA matrix, identify metabolic subtypes. Optimal number of k is defined based on data or can be user-specified.
3. Summarize pathway activity per cluster: i.e. mean pathway activity per cluster, or do differential testing.
4. Perform Kaplan-Meier (KM) analysis with clusters if survival data are present.

1.1 Introduction

…

1.2 Load libraries and test data

# Set wd to current location:
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

########## Load source codes and libraries

library("tidyverse")
library("fgsea")

source("../R/run_gsva_metabolic.R")
source("../R/kmeans_gsva_metabolic.R")
source("../R/kmeans_metab_clust_survival.R")
source("../R/plot_mean_pathway_activity.R")

########## Load data

# KEGG metabolic pathways gene set:
kegg_gs <- gmtPathways("../data/kegg_metabolic_human_20211026.gmt")

# CPTAC-HNSCC
gene_exp_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Normal.cct",
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("Idx")
clinical_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_CLI.tsi",
    sep = "\t", header = T, stringsAsFactors = F)
clinical_cptac_hnscc <- clinical_cptac_hnscc[-1, ] %>%
    mutate(case_id = str_replace_all(case_id, "-", "\\."), overall_survival = as.numeric(overall_survival),
        overall_free_status = as.numeric(overall_free_status), progression_free_status = as.numeric(progression_free_status),
        progression_free_survival = as.numeric(progression_free_survival)) %>%
    filter(P16 != "Positive (>70% nuclear and cytoplasmic staining)" & case_id %in%
        colnames(gene_exp_cptac_hnscc))

# TCGA-ACC
gene_exp_tcga_acc <- read.table("../data/TCGA_ACC/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_acc <- gene_exp_tcga_acc[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_acc) <- colnames(gene_exp_tcga_acc) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_acc <- read.table("../data/TCGA_ACC/HS_CPTAC_ACC_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_acc <- clinical_cptac_acc %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# TCGA-UVM
gene_exp_tcga_uvm <- read.table("../data/TCGA_UVM/UVM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_uvm <- gene_exp_tcga_uvm[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_uvm) <- colnames(gene_exp_tcga_uvm) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_uvm <- read.table("../data/TCGA_UVM/HS_CPTAC_UVM_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_uvm <- clinical_cptac_uvm %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# TCGA-LAML
gene_exp_tcga_laml <- read.table("../data/TCGA_LAML/LAML.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_laml <- gene_exp_tcga_laml[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_laml) <- colnames(gene_exp_tcga_laml) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_laml <- read.table("../data/TCGA_LAML/HS_CPTAC_LAML_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_laml <- clinical_cptac_laml %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# CPTAC-COAD data:
gene_exp_cptac_coad <- read.table("../data/CPTAC_COAD/Human__CPTAC_COAD__UNC__RNAseq__HiSeq_RNA__03_01_2017__BCM__Gene__BCM_RSEM_UpperQuartile_log2.cct",
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("attrib_name")
# CPTAC-COAD: There is no survival days in survival data

# TCGA-COAD data:
gene_exp_tcga_coad <- read.table(gzfile("../data/TCGA_COAD/Human__TCGA_COADREAD__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz"),
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("attrib_name")
clinical_tcga_coad <- read.table("../data/TCGA_COAD/HS_CPTAC_TCGA_COAD_CLI.tsi",
    sep = "\t", header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_tcga_coad <- clinical_tcga_coad %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

1.3 Perforom GSVA on gene expression data

GSVA: gene set variation analysis for microarray and RNA-Seq data
This might take some time, especially for larger (n > 100) data sets.

# CPTAC-HNSC data:
gsva_metab_cptac_hnscc <- run_gsva_metabolic(gene_exp_cptac_hnscc, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# TCGA-ACC data:
gsva_metab_tcga_acc <- run_gsva_metabolic(gene_exp_tcga_acc, kcdf = "Gaussian", kegg_gs = kegg_gs)

# TCGA-UVM data:
gsva_metab_tcga_uvm <- run_gsva_metabolic(gene_exp_tcga_uvm, kcdf = "Gaussian", kegg_gs = kegg_gs)

# TCGA-LAML data:
gsva_metab_tcga_laml <- run_gsva_metabolic(gene_exp_tcga_laml, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# TCGA-COAD data:
gsva_metab_tcga_coad <- run_gsva_metabolic(gene_exp_tcga_coad, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# CPTAC-COAD data:
gsva_metab_cptac_coad <- run_gsva_metabolic(gene_exp_cptac_coad, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

1.4 K-means clustering on the GSVA matrix

It is recommended to use the optimal number of k for the clustering step. This is selected based on several indices included in the NbClust R library. Otherwise, you can set the parameters user_def_k = TRUE and k = ... in the kmeans_gsva_metabolic() function.

# CPTAC-HNSC data:
kmeans_clusters_cptac_hnscc <- kmeans_gsva_metabolic(gsva_metab_cptac_hnscc, kegg_gs = kegg_gs)
# TCGA-ACC data:
kmeans_clusters_tcga_acc <- kmeans_gsva_metabolic(gsva_metab_tcga_acc, kegg_gs = kegg_gs)
# TCGA-UVM data:
kmeans_clusters_tcga_uvm <- kmeans_gsva_metabolic(gsva_metab_tcga_uvm, kegg_gs = kegg_gs)
# TCGA-LAML data:
kmeans_clusters_tcga_laml <- kmeans_gsva_metabolic(gsva_metab_tcga_laml, kegg_gs = kegg_gs)
# TCGA-COAD data:
kmeans_clusters_tcga_coad <- kmeans_gsva_metabolic(gsva_metab_tcga_coad, kegg_gs = kegg_gs,
    user_def_k = TRUE, k = 2)

# CPTAC-COAD data:
kmeans_clusters_cptac_coad <- kmeans_gsva_metabolic(gsva_metab_cptac_coad, kegg_gs = kegg_gs)
kmeans_clusters_cptac_hnscc$heatmap

kmeans_clusters_tcga_acc$heatmap

kmeans_clusters_tcga_uvm$heatmap

kmeans_clusters_tcga_laml$heatmap

kmeans_clusters_tcga_coad$heatmap

kmeans_clusters_cptac_coad$heatmap

1.5 Barplot of mean pathway activity per cluster

plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_hnscc, kegg_gs = kegg_gs,
    kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_acc, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_uvm, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_laml, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_coad, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_coad, kegg_gs = kegg_gs,
    kmeans_res = kmeans_clusters_cptac_coad$kmean_res)

1.6 Kaplan-Meier analysis of the identified metabolic clusters

# CPTAC-HNSC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res, clinical_data = clinical_cptac_hnscc,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-ACC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_acc$kmean_res, clinical_data = clinical_cptac_acc,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-UVM data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_uvm$kmean_res, clinical_data = clinical_cptac_uvm,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-LAML data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_laml$kmean_res, clinical_data = clinical_cptac_laml,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-COAD data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_coad$kmean_res, clinical_data = clinical_tcga_coad,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] survminer_0.4.9       ggpubr_0.6.0          survival_3.5-7       
##  [4] NbClust_3.0.1         viridis_0.6.3         viridisLite_0.4.1    
##  [7] ComplexHeatmap_2.16.0 GSVA_1.48.0           fgsea_1.25.2         
## [10] lubridate_1.9.2       forcats_1.0.0         stringr_1.5.0        
## [13] dplyr_1.1.2           purrr_1.0.1           readr_2.1.4          
## [16] tidyr_1.3.0           tibble_3.2.1          ggplot2_3.4.2        
## [19] tidyverse_2.0.0       BiocStyle_2.28.1     
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3          rstudioapi_0.14            
##   [3] jsonlite_1.8.4              shape_1.4.6                
##   [5] magrittr_2.0.3              magick_2.8.0               
##   [7] farver_2.1.1                rmarkdown_2.21             
##   [9] GlobalOptions_0.1.2         zlibbioc_1.45.0            
##  [11] vctrs_0.6.2                 memoise_2.0.1              
##  [13] Cairo_1.6-1                 DelayedMatrixStats_1.22.0  
##  [15] RCurl_1.98-1.12             rstatix_0.7.2              
##  [17] htmltools_0.5.5             S4Arrays_1.0.4             
##  [19] broom_1.0.4                 Rhdf5lib_1.22.0            
##  [21] rhdf5_2.44.0                sass_0.4.5                 
##  [23] bslib_0.4.2                 zoo_1.8-12                 
##  [25] cachem_1.0.7                commonmark_1.9.0           
##  [27] lifecycle_1.0.3             iterators_1.0.14           
##  [29] pkgconfig_2.0.3             rsvd_1.0.5                 
##  [31] Matrix_1.6-1.1              R6_2.5.1                   
##  [33] fastmap_1.1.1               GenomeInfoDbData_1.2.10    
##  [35] MatrixGenerics_1.11.1       clue_0.3-64                
##  [37] digest_0.6.31               colorspace_2.1-0           
##  [39] AnnotationDbi_1.61.2        S4Vectors_0.37.7           
##  [41] irlba_2.3.5.1               GenomicRanges_1.51.4       
##  [43] RSQLite_2.3.1               beachmat_2.16.0            
##  [45] labeling_0.4.2              km.ci_0.5-6                
##  [47] fansi_1.0.4                 timechange_0.2.0           
##  [49] abind_1.4-5                 httr_1.4.5                 
##  [51] compiler_4.3.1              bit64_4.0.5                
##  [53] withr_2.5.0                 doParallel_1.0.17          
##  [55] backports_1.4.1             BiocParallel_1.33.12       
##  [57] carData_3.0-5               DBI_1.1.3                  
##  [59] highr_0.10                  HDF5Array_1.28.1           
##  [61] ggsignif_0.6.4              DelayedArray_0.26.3        
##  [63] rjson_0.2.21                tools_4.3.1                
##  [65] glue_1.6.2                  rhdf5filters_1.12.1        
##  [67] gridtext_0.1.5              cluster_2.1.4              
##  [69] generics_0.1.3              gtable_0.3.3               
##  [71] KMsurv_0.1-5                tzdb_0.3.0                 
##  [73] data.table_1.14.8           hms_1.1.3                  
##  [75] xml2_1.3.3                  car_3.1-2                  
##  [77] BiocSingular_1.16.0         ScaledMatrix_1.8.1         
##  [79] utf8_1.2.3                  XVector_0.39.0             
##  [81] BiocGenerics_0.45.3         markdown_1.6               
##  [83] foreach_1.5.2               pillar_1.9.0               
##  [85] splines_4.3.1               circlize_0.4.15            
##  [87] ggtext_0.1.2                lattice_0.21-9             
##  [89] bit_4.0.5                   annotate_1.77.0            
##  [91] tidyselect_1.2.0            SingleCellExperiment_1.22.0
##  [93] Biostrings_2.67.2           knitr_1.42                 
##  [95] gridExtra_2.3               bookdown_0.34              
##  [97] IRanges_2.33.1              SummarizedExperiment_1.29.1
##  [99] stats4_4.3.1                xfun_0.39                  
## [101] Biobase_2.59.0              matrixStats_0.63.0         
## [103] stringi_1.7.12              yaml_2.3.7                 
## [105] evaluate_0.20               codetools_0.2-19           
## [107] BiocManager_1.30.20         graph_1.78.0               
## [109] cli_3.6.1                   xtable_1.8-4               
## [111] munsell_0.5.0               jquerylib_0.1.4            
## [113] survMisc_0.5.6              Rcpp_1.0.10                
## [115] GenomeInfoDb_1.35.17        png_0.1-8                  
## [117] XML_3.99-0.14               blob_1.2.4                 
## [119] sparseMatrixStats_1.12.0    bitops_1.0-7               
## [121] GSEABase_1.62.0             scales_1.2.1               
## [123] crayon_1.5.2                GetoptLong_1.0.5           
## [125] rlang_1.1.0                 cowplot_1.1.1              
## [127] fastmatch_1.1-3             KEGGREST_1.39.0            
## [129] formatR_1.14